── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks plotly::filter(), stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)library(zoo)
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
1. Read in the data
## data extracted from New York Times state-level data from NYT Github repository# https://github.com/nytimes/covid-19-data## state-level population information from us_census_data available on GitHub repository:# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data# load COVID state-level data from NYTcv_states <-read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")str(cv_states)
'data.frame': 61942 obs. of 5 variables:
$ date : chr "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" ...
$ state : chr "Washington" "Washington" "Washington" "Illinois" ...
$ fips : int 53 53 53 17 53 6 17 53 4 6 ...
$ cases : int 1 1 1 1 1 1 1 1 1 2 ...
$ deaths: int 0 0 0 0 0 0 0 0 0 0 ...
table(cv_states$state)
Alabama Alaska American Samoa
1106 1107 548
Arizona Arkansas California
1153 1108 1154
Colorado Connecticut Delaware
1114 1111 1108
District of Columbia Florida Georgia
1112 1118 1117
Guam Hawaii Idaho
1104 1113 1106
Illinois Indiana Iowa
1155 1113 1111
Kansas Kentucky Louisiana
1112 1113 1110
Maine Maryland Massachusetts
1107 1114 1147
Michigan Minnesota Mississippi
1109 1113 1108
Missouri Montana Nebraska
1112 1106 1131
Nevada New Hampshire New Jersey
1114 1117 1115
New Mexico New York North Carolina
1108 1118 1116
North Dakota Northern Mariana Islands Ohio
1108 1091 1110
Oklahoma Oregon Pennsylvania
1113 1120 1113
Puerto Rico Rhode Island South Carolina
1106 1118 1113
South Dakota Tennessee Texas
1109 1114 1136
Utah Vermont Virgin Islands
1123 1112 1105
Virginia Washington West Virginia
1112 1158 1102
Wisconsin Wyoming
1143 1108
#Non-states = Virgin Islands, Puerto Rico, Northern Mariana Islands, DC, Guam, American Samoa# load state population datastate_pops <-read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv")str(state_pops)
Alabama Alaska Arizona
1 1 1
Arkansas California Colorado
1 1 1
Connecticut Delaware District of Columbia
1 1 1
Florida Georgia Hawaii
1 1 1
Idaho Illinois Indiana
1 1 1
Iowa Kansas Kentucky
1 1 1
Louisiana Maine Maryland
1 1 1
Massachusetts Michigan Minnesota
1 1 1
Mississippi Missouri Montana
1 1 1
Nebraska Nevada New Hampshire
1 1 1
New Jersey New Mexico New York
1 1 1
North Carolina North Dakota Ohio
1 1 1
Oklahoma Oregon Pennsylvania
1 1 1
Puerto Rico Rhode Island South Carolina
1 1 1
South Dakota Tennessee Texas
1 1 1
Utah Vermont Virginia
1 1 1
Washington West Virginia Wisconsin
1 1 1
Wyoming
1
#Non-states = DC, Puerto Ricocv_states <-merge(cv_states, state_pops, by="state")
2. Look at the data
Inspect the dimensions, head, and tail of the data
Inspect the structure of each variables. Are they in the correct format?
We had two datasets with 5 variables each. Once we merged them, we have 9 variables, which is expected. We lost observations from US Virgin Islands, Northern Mariana Islands, Guam, and American Samoa when we merged (~10000).
dim(cv_states)
[1] 58094 9
head(cv_states)
state date fips cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04 1 1587224 21263 1 4887871 96.50939 AL
2 Alabama 2020-04-25 1 6213 213 1 4887871 96.50939 AL
3 Alabama 2023-02-26 1 1638348 21400 1 4887871 96.50939 AL
4 Alabama 2022-12-03 1 1549285 21129 1 4887871 96.50939 AL
5 Alabama 2020-05-06 1 8691 343 1 4887871 96.50939 AL
6 Alabama 2021-04-21 1 524367 10807 1 4887871 96.50939 AL
Inspect the range values for each variable. What is the date range? The range of cases and deaths?
The date range is January 21, 2020 to March 23, 2023. The range of cases is 1-12 Million (!), and the range of deaths is 0-104,277. There are some missing values for population density.
# format the datecv_states$date <-as.Date(cv_states$date, format="%Y-%m-%d")# format the state and state abbreviation (abb) variablesstate_list <-unique(cv_states$state)cv_states$state <-factor(cv_states$state, levels = state_list)abb_list <-unique(cv_states$abb)cv_states$abb <-factor(cv_states$abb, levels = abb_list)# order the data first by state, second by datecv_states <- cv_states %>%arrange(state, date)# Confirm the variables are now correctly formattedstr(cv_states)
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?head(cv_states)
state date fips cases deaths geo_id population pop_density abb
1 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
2 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
3 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
4 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
5 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
6 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
state date fips cases
Washington : 1158 Min. :2020-01-21 Min. : 1.00 Min. : 1
Illinois : 1155 1st Qu.:2020-12-06 1st Qu.:16.00 1st Qu.: 112125
California : 1154 Median :2021-09-11 Median :29.00 Median : 418120
Arizona : 1153 Mean :2021-09-10 Mean :29.78 Mean : 947941
Massachusetts: 1147 3rd Qu.:2022-06-17 3rd Qu.:44.00 3rd Qu.: 1134318
Wisconsin : 1143 Max. :2023-03-23 Max. :72.00 Max. :12169158
(Other) :51184
deaths geo_id population pop_density
Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
1st Qu.: 1598 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
Median : 5901 Median :29.00 Median : 4468402 Median : 107.860
Mean : 12553 Mean :29.78 Mean : 6397965 Mean : 423.031
3rd Qu.: 15952 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
Max. :104277 Max. :72.00 Max. :39557045 Max. :11490.120
NA's :1106
abb
WA : 1158
IL : 1155
CA : 1154
AZ : 1153
MA : 1147
WI : 1143
(Other):51184
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"
4. Add new_cases and new_deaths and correct outliers
Add variables for new cases, new_cases, and new deaths, new_deaths: (Hint: You can set new_cases equal to the difference between cases on date i and date i-1, starting on date i=2). Filter to dates after June 1, 2021. Use plotly for EDA: See if there are outliers or values that don't make sense for new_cases and new_deaths. Which states and which dates have strange values?
In California there were huge spikes in new cases on 9/3/21, 9/16/21, 5/15/21, 7/4/22, 11/11/22, 1/22/223.
In Florida, there were spikes in new cases on 7/29/22 and 2/18/23.
In Texas, there was a spike in new cases on 11/29/22.
There are some negative new case and new deaths in multiple states on multiple dates, the most startling number being -3770 new deaths in MA on 10/08/22, and -598 deaths in Ohio on 12/16/22.
Correct outliers: Set negative values for new_cases or new_deaths to 0. Recalculate cases and deaths as cumulative sum of updated new_cases and new_deaths.Get the rolling average of new cases and new deaths to smooth over time. Inspect data again interactively.
New deaths make a lot more sense now – no negative values. Still see spikes in deaths in various states that may represent batched reporting of deaths?
# Add variables for new_cases and new_deaths:for (i in1:length(state_list)) { cv_subset =subset(cv_states, state == state_list[i]) cv_subset = cv_subset[order(cv_subset$date),]# add starting level for new cases and deaths cv_subset$new_cases = cv_subset$cases[1] cv_subset$new_deaths = cv_subset$deaths[1]##FINISHfor (j in2:nrow(cv_subset)) { cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1] cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1] }# include in main dataset cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths}# Focus on recent datescv_states <- cv_states %>% dplyr::filter(date >="2021-06-01")### FINISH# Inspect outliers in new_cases using plotlyp1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +geom_line() +geom_point(size = .5, alpha =0.5)ggplotly(p1)
p1<-NULL# to clear from workspacep2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +geom_line() +geom_point(size = .5, alpha =0.5)ggplotly(p2)
p2<-NULL# to clear from workspace# set negative new case or death counts to 0cv_states$new_cases[cv_states$new_cases<0] =0cv_states$new_deaths[cv_states$new_deaths<0] =0# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`for (i in1:length(state_list)) { cv_subset =subset(cv_states, state == state_list[i])# add starting level for new cases and deaths cv_subset$cases = cv_subset$cases[1] cv_subset$deaths = cv_subset$deaths[1]### FINISHfor (j in2:nrow(cv_subset)) { cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$new_cases[j-1] cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$new_deaths[j-1] }# include in main dataset cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths}# Smooth new countscv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>%round(digits =0)cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>%round(digits =0)# Inspect data again interactivelyp2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +geom_line() +geom_point(size = .5, alpha =0.5)ggplotly(p2)
p2=NULL
5. Add additional variables
Add population-normalized (by 100,000) variables for each variable type (rounded to 1 decimal place). Make sure the variables you calculate are in the correct format (numeric). You can use the following variable names:
per100k = cases per 100,000 population
newper100k= new cases per 100,000
deathsper100k = deaths per 100,000
newdeathsper100k = new deaths per 100,000
Add a "naive CFR" variable representing deaths / cases on each date for each state. Create a dataframe representing values on the most recent date, cv_states_today, as done in lecture.
### FINISH# add population normalized (by 100,000) counts for each variablecv_states$per100k =as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))cv_states$newper100k =as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / casescv_states = cv_states %>%mutate(naive_CFR =round((deaths*100/cases),2))# create a `cv_states_today` variablecv_states_today =subset(cv_states, date==max(cv_states$date))
6. Explore scatterplots using plot_ly()
Create a scatterplot using plot_ly() representing pop_density vs. various variables (e.g. cases, per100k, deaths, deathsper100k) for each state on most recent date (cv_states_today)
Color points by state and size points by state population. Use hover to identify any outliers. Remove those outliers and replot. Choose one plot. Add hoverinfo specifying the state name, cases per 100k, and deaths per 100k, similarly to how we did this in the lecture notes. Add layout information to title the chart and the axes. Enable hovermode = "compare"
### FINISH CODE HERE# pop_density vs. deathscv_states_today %>%plot_ly(x =~pop_density, y =~deaths, type ='scatter', mode ='markers', color =~state,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# filter out "District of Columbia"cv_states_today_filter <- cv_states_today %>%filter(state!="District of Columbia")# pop_density vs. cases after filteringcv_states_today_filter %>%plot_ly(x =~pop_density, y =~deaths, type ='scatter', mode ='markers', color =~state,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# pop_density vs. deathsper100kcv_states_today_filter %>%plot_ly(x =~pop_density, y =~deathsper100k,type ='scatter', mode ='markers', color =~state,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 1 observations
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Adding hoverinfocv_states_today_filter %>%plot_ly(x =~pop_density, y =~deathsper100k,type ='scatter', mode ='markers', color =~state,size =~population, sizes =c(5, 70), marker =list(sizemode='diameter', opacity=0.5),hoverinfo ='text',text =~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") , paste(" Deaths per 100k: ", deathsper100k, sep=""), sep ="<br>")) %>%layout(title ="Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",yaxis =list(title ="Deaths per 100k"), xaxis =list(title ="Population Density"),hovermode ="compare")
Warning: Ignoring 1 observations
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()
For pop_density vs. newdeathsper100k create a chart with the same variables using gglot_ly().Explore the pattern between \(x\) and \(y\) using geom_smooth().Explain what you see. Do you think pop_density is a correlate of newdeathsper100k?
There does seem to be a relationship of state-level population density with deaths, though this is really only apparent at higher population densities. At low densities there is high variability in death rates (probably politically influenced). County-level population density vs death rates may have more interesting results.
### FINISH p <-ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) +geom_point(aes(color=state)) +geom_smooth() +labs(title ="Deaths per 100,000 People by State Population Density",x ="Population Density",y ="Deaths per 100,000 People") +scale_color_discrete(name ="State, Size by\nTotal Population")ggplotly(p)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: size
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
8. Multiple line chart
Create a line chart of the naive_CFR for all states over time using plot_ly().Use the zoom and pan tools to inspect the naive_CFR for the states that had an increase in September. How have they changed over time?
In most states, CFR was stable over time. Exceptions include an increase in CFR for Kansas in April 2022, a peak in Texas on Sept 11 2022, and a very sharp and likely erroneous CFR of 118k in California in Nov 2022.
Create one more line chart, for Florida only, which shows new_cases and new_deaths together in one plot. Hint: use add_layer()Use hoverinfo to "eyeball" the approximate peak of deaths and peak of cases. What is the time delay between the peak of cases and the peak of deaths?
The peaks in deaths are so small it is hard to see the true peak cases on the screen at the same time as the peaks in deaths, but the largest peak in cases is on Jul 29 2022 with 193k cases, and the next peak in deaths comes on August 11 2022 with 649 (this date was also a peak case-reporting date). This reporting cycle with such high peaks and troughs on a daily basis and concurrent peaks in deaths/cases indicates there may be batch reporting of both cases and deaths occurring. However, the two week lag from peak cases to next peak of deaths (not same day) fits with our knowledge of the natural history of COVID pneumonia.
### FINISH# Line chart for naive_CFR for all states over time using `plot_ly()`plot_ly(cv_states, x =~date, y =~naive_CFR, color =~state, type ="scatter", mode ="lines")
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
### FINISH# Line chart for Florida showing new_cases and new_deaths togethercv_states %>%filter(state=="Florida") %>%arrange(date) %>%plot_ly(opacity =0.5) %>%add_lines(x =~date, y =~new_cases, type ="scatter", mode ="line", color =I("blue"), name ="New Cases") %>%add_lines(x =~date, y =~new_deaths, type ="scatter", mode ="line", color =I("red"), name ="New Deaths")
9. Heatmaps
Create a heatmap to visualize new_cases for each state on each date greater than June 1st, 2021. Start by mapping selected features in the dataframe into a matrix using the tidyr package function pivot_wider(), naming the rows and columns, as done in the lecture notes.
Use plot_ly() to create a heatmap out of this matrix. Which states stand out?
My matrix looks correct but the plot is only showing the new case rate for a three month window, no matter what I set the filtering date as. So I’ve truncated it down to the window that shows up – and the dates when I hover are October 2, 2022 for all states?? Florida has a stand out total case rate on this date.
Repeat with newper100k variable. Now which states stand out?
Same problem as above – but states with higher cases per 100,000 people include Connecticut, Florida, and Montana on Oct 2, 2022.
Create a second heatmap in which the pattern of new_cases for each state over time becomes more clear by filtering to only look at dates every two weeks.
This time we at least got a few different dates plotted. There is a stand-out point in Tennessee in December 2021…though the square is over July 2022 :(.
### FINISH# Map state, date, and new_cases to a matrixcv_states_mat <- cv_states %>%select(state, date, new_cases) %>%filter(date>="2021-06-01")cv_states_mat2 <-as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))str(cv_states_mat2)
'data.frame': 661 obs. of 53 variables:
$ date : Date, format: "2021-06-01" "2021-06-02" ...
$ Alabama : num NA NA NA NA NA NA 417 356 321 301 ...
$ Alaska : num 296 301 309 317 317 37 44 29 33 25 ...
$ Arizona : num 71 202 248 297 266 339 392 451 382 389 ...
$ Arkansas : num 389 424 452 488 517 150 159 182 176 190 ...
$ California : num 186 318 473 641 760 ...
$ Colorado : num 2358 2434 2553 2486 2497 ...
$ Connecticut : num 338 354 313 241 195 130 107 84 83 82 ...
$ Delaware : num 109 112 87 88 66 55 43 42 45 43 ...
$ District of Columbia: num 62 64 68 72 72 24 30 17 17 16 ...
$ Florida : num 865 1042 1310 1310 1310 ...
$ Georgia : num 34 102 171 240 289 327 366 406 410 404 ...
$ Hawaii : num 406 409 415 424 433 46 51 47 50 53 ...
$ Idaho : num 114 139 161 185 185 112 146 122 118 110 ...
$ Illinois : num 162 229 330 419 493 443 480 473 464 421 ...
$ Indiana : num 1293 1330 1504 1560 1620 ...
$ Iowa : num 7 30 48 63 76 87 91 97 88 82 ...
$ Kansas : num 240 301 301 350 350 110 141 144 124 125 ...
$ Kentucky : num 241 282 326 383 417 236 250 270 272 264 ...
$ Louisiana : num 446 537 582 328 328 328 442 317 336 358 ...
$ Maine : num 606 614 629 640 653 65 69 73 72 66 ...
$ Maryland : num 142 151 171 191 83 98 110 106 115 114 ...
$ Massachusetts : num 270 289 326 309 245 221 179 169 171 151 ...
$ Michigan : num 502 576 652 732 788 790 492 405 372 350 ...
$ Minnesota : num 872 891 924 960 149 186 214 213 214 203 ...
$ Mississippi : num 379 394 421 441 441 441 115 118 118 119 ...
$ Missouri : num 173 245 340 430 488 337 404 478 496 497 ...
$ Montana : num 374 397 415 411 106 99 80 97 84 80 ...
$ Nebraska : num 24 28 35 33 33 34 46 35 35 32 ...
$ Nevada : num 289 349 399 263 254 243 324 301 272 264 ...
$ New Hampshire : num 165 168 175 180 180 180 31 25 29 29 ...
$ New Jersey : num 109 148 188 179 228 258 254 260 247 256 ...
$ New Mexico : num 486 496 511 349 266 185 124 85 86 84 ...
$ New York : num 230 288 357 410 503 535 570 567 570 579 ...
$ North Carolina : num 1130 1077 1069 1061 822 ...
$ North Dakota : num 633 645 652 660 664 36 39 42 36 36 ...
$ Ohio : num 89 137 207 295 349 388 424 396 404 383 ...
$ Oklahoma : num 1012 1018 1035 1057 1057 ...
$ Oregon : num 311 365 395 463 502 533 264 283 277 290 ...
$ Pennsylvania : num 387 464 571 658 713 494 562 503 479 468 ...
$ Puerto Rico : num 752 754 758 769 780 45 54 54 53 56 ...
$ Rhode Island : num 285 236 200 182 173 112 47 37 37 36 ...
$ South Carolina : num 75 112 127 141 173 199 138 155 150 147 ...
$ South Dakota : num 284 289 292 294 10 10 12 16 12 13 ...
$ Tennessee : num 168 215 255 290 290 199 246 195 175 161 ...
$ Texas : num 585 921 1364 1598 1783 ...
$ Utah : num 1374 1402 1451 1492 1536 ...
$ Vermont : num 153 154 155 157 157 157 9 10 10 11 ...
$ Virginia : num 44 70 107 144 185 207 185 202 204 198 ...
$ Washington : num 519 627 750 725 760 709 752 664 641 626 ...
$ West Virginia : num 618 634 655 672 672 135 179 111 108 105 ...
$ Wisconsin : num 156 193 221 252 252 105 147 167 149 147 ...
$ Wyoming : num 527 537 552 564 305 217 90 69 73 67 ...
rownames(cv_states_mat2) <- cv_states_mat2$datecv_states_mat2$date <-NULLcv_states_mat2 <-as.matrix(cv_states_mat2)# Create a heatmap using plot_ly()plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),z=cv_states_mat2,type="heatmap",showscale=T) %>%layout(yaxis =list(range=c("2022-12-01","2023-04-30")))
# Create a second heatmap after filtering to only include dates every other weekfilter_dates <-seq(as.Date("2021-06-15"), as.Date("2023-11-01"), by=14)cv_states_mat <- cv_states %>%select(state, date, newper100k) %>%filter(date %in% filter_dates)cv_states_mat2 <-as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))rownames(cv_states_mat2) <- cv_states_mat2$datecv_states_mat2$date <-NULLcv_states_mat2 <-as.matrix(cv_states_mat2)# Create a heatmap using plot_ly()plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),z=~cv_states_mat2,type="heatmap",showscale=T)
10. Map
Create a map to visualize the naive_CFR by state on October 15, 2021. Compare with a map visualizing the naive_CFR by state on most recent date. Plot the two maps together using subplot(). Make sure the shading is for the same range of values (google is your friend for this)
Describe the difference in the pattern of the CFR.
The case-fatality rate on Nov 10, 2021, we see peak fatalities were in Rhode Island (92.5) and Illinois (71.4). Interestingly, the case-fatality rates today are higher, with peaks in Virginia (102.8) Florida (79.8), Tennessee (82.2), Hawaii (81.2), and Illinois (83.5). The distribution of CFR has higher death rates in the East/South for both days, but there are actually more states with high CFR today than during “peak” pandemic times.
### For specified datepick.date ="2021-11-10"# Extract the data for each state by its abbreviationcv_per100 <- cv_states %>%filter(date==pick.date) %>%select(state, abb, newper100k, cases, deaths) # select datacv_per100$state_name <- cv_per100$statecv_per100$state <- cv_per100$abbcv_per100$abb <-NULL# Create hover textcv_per100$hover <-with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))# Set up mapping detailsset_map_details <-list(scope ='usa',projection =list(type ='albers usa'),showlakes =TRUE,lakecolor =toRGB('white'))# Make sure both maps are on the same color scaleshadeLimit <-125# Create the mapfig <-plot_geo(cv_per100, locationmode ='USA-states') %>%add_trace(z =~newper100k, text =~hover, locations =~state,color =~newper100k, colors ='Purples' )fig <- fig %>%colorbar(title =paste0("Cases per 100k: ", pick.date), limits =c(0,shadeLimit))fig <- fig %>%layout(title =paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),geo = set_map_details )fig_pick.date <- fig################ Map for today's date# Extract the data for each state by its abbreviationcv_per100 <- cv_states_today %>%select(state, abb, newper100k, cases, deaths) # select datacv_per100$state_name <- cv_per100$statecv_per100$state <- cv_per100$abbcv_per100$abb <-NULL# Create hover textcv_per100$hover <-with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))# Set up mapping detailsset_map_details <-list(scope ='usa',projection =list(type ='albers usa'),showlakes =TRUE,lakecolor =toRGB('white'))# Create the mapfig <-plot_geo(cv_per100, locationmode ='USA-states') %>%add_trace(z =~newper100k, text =~hover, locations =~state,color =~newper100k, colors ='Purples' )fig <- fig %>%colorbar(title =paste0("Cases per 100k: ", Sys.Date()), limits =c(0,shadeLimit))fig <- fig %>%layout(title =paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),geo = set_map_details )fig_Today <- figfig_Today
### Plot together subplot(fig_pick.date, fig_Today, nrows =2, margin = .05)